# Load the Lending Club loan data and keep only rows without NA values.
library(tidyr)

# stringsAsFactors is spelled out because the summaries recorded below report
# the text columns as factors; since R 4.0.0 read.csv() would otherwise return
# them as character vectors and the descriptive output would differ.
lend <- read.csv(
  "lending_club_loan.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

# NOTE(review): drop_na() removes rows containing NA only. Blank strings are
# not NA, so they survive as an empty factor level (e.g. emp_length shows a
# blank level with 17'239 rows in the output below) -- confirm whether those
# rows should count as missing.
lend <- drop_na(lend)

# Rows and columns remaining after the NA filter.
dim(lend)
## [1] 358014     27
# List the column names of the cleaned data frame.
names(lend)
##  [1] "loan_amnt"            "term"                 "int_rate"            
##  [4] "installment"          "grade"                "sub_grade"           
##  [7] "emp_title"            "emp_length"           "home_ownership"      
## [10] "annual_inc"           "verification_status"  "issue_d"             
## [13] "loan_status"          "purpose"              "title"               
## [16] "dti"                  "earliest_cr_line"     "open_acc"            
## [19] "pub_rec"              "revol_bal"            "revol_util"          
## [22] "total_acc"            "initial_list_status"  "application_type"    
## [25] "mort_acc"             "pub_rec_bankruptcies" "address"
# Preview the first six rows of the cleaned data frame.
head(lend, n = 6L)
##   loan_amnt       term int_rate installment grade sub_grade
## 1     10000  36 months    11.44      329.48     B        B4
## 2      8000  36 months    11.99      265.68     B        B5
## 3     15600  36 months    10.49      506.97     B        B3
## 4      7200  36 months     6.49      220.65     A        A2
## 5     24375  60 months    17.27      609.33     C        C5
## 6     20000  36 months    13.33      677.07     C        C3
##                 emp_title emp_length home_ownership annual_inc
## 1               Marketing  10+ years           RENT     117000
## 2         Credit analyst     4 years       MORTGAGE      65000
## 3            Statistician   < 1 year           RENT      43057
## 4         Client Advocate    6 years           RENT      54000
## 5 Destiny Management Inc.    9 years       MORTGAGE      55000
## 6           HR Specialist  10+ years       MORTGAGE      86788
##   verification_status  issue_d loan_status            purpose
## 1        Not Verified Jan-2015  Fully Paid           vacation
## 2        Not Verified Jan-2015  Fully Paid debt_consolidation
## 3     Source Verified Jan-2015  Fully Paid        credit_card
## 4        Not Verified Nov-2014  Fully Paid        credit_card
## 5            Verified Apr-2013 Charged Off        credit_card
## 6            Verified Sep-2015  Fully Paid debt_consolidation
##                     title   dti earliest_cr_line open_acc pub_rec revol_bal
## 1                Vacation 26.24         Jun-1990       16       0     36369
## 2      Debt consolidation 22.05         Jul-2004       17       0     20131
## 3 Credit card refinancing 12.79         Aug-2007       13       0     11987
## 4 Credit card refinancing  2.60         Sep-2006        6       0      5472
## 5   Credit Card Refinance 33.95         Mar-1999       13       0     24584
## 6      Debt consolidation 16.31         Jan-2005        8       0     25757
##   revol_util total_acc initial_list_status application_type mort_acc
## 1       41.8        25                   w       INDIVIDUAL        0
## 2       53.3        27                   f       INDIVIDUAL        3
## 3       92.2        26                   f       INDIVIDUAL        0
## 4       21.5        13                   f       INDIVIDUAL        0
## 5       69.8        43                   f       INDIVIDUAL        1
## 6      100.6        23                   f       INDIVIDUAL        4
##   pub_rec_bankruptcies
## 1                    0
## 2                    0
## 3                    0
## 4                    0
## 5                    0
## 6                    0
##                                                     address
## 1              0174 Michelle Gateway\nMendozaberg, OK 22690
## 2           1076 Carney Fort Apt. 347\nLoganmouth, SD 05113
## 3           87025 Mark Dale Apt. 269\nNew Sabrina, WV 05113
## 4                     823 Reid Ford\nDelacruzside, MA 00813
## 5                      679 Luna Roads\nGreggshire, VA 11650
## 6 1726 Cooper Passage Suite 129\nNorth Deniseberg, DE 30723
# Per-column descriptive statistics for every variable in the data frame.
library(DescTools)
DescTools::Desc(lend)
## ------------------------------------------------------------------------------ 
## Describe lend (data.frame):
## 
## data frame:  358014 obs. of  27 variables
##      358014 complete cases (100.0%)
## 
##   Nr  ColName               Class    NAs  Levels                           
##   1   loan_amnt             numeric  .                                     
##   2   term                  factor   .    (2): 1- 36 months, 2- 60 months  
##   3   int_rate              numeric  .                                     
##   4   installment           numeric  .                                     
##   5   grade                 factor   .    (7): 1-A, 2-B, 3-C, 4-D, 5-E, ...
##   6   sub_grade             factor   .    (35): 1-A1, 2-A2, 3-A3, 4-A4,    
##                                           5-A5, ...                        
##   7   emp_title             factor   .    (173106): 1-, 2- NSA Industries  
##                                           llc, 3- Fibro Source, 4- Long    
##                                           Ilsand College Hospital, 5-      
##                                           mortgage banker, ...             
##   8   emp_length            factor   .    (12): 1-, 2-< 1 year, 3-1 year,  
##                                           4-10+ years, 5-2 years, ...      
##   9   home_ownership        factor   .    (6): 1-ANY, 2-MORTGAGE, 3-NONE,  
##                                           4-OTHER, 5-OWN, ...              
##   10  annual_inc            numeric  .                                     
##   11  verification_status   factor   .    (3): 1-Not Verified, 2-Source    
##                                           Verified, 3-Verified             
##   12  issue_d               factor   .    (115): 1-Apr-2008, 2-Apr-2009,   
##                                           3-Apr-2010, 4-Apr-2011,          
##                                           5-Apr-2012, ...                  
##   13  loan_status           factor   .    (2): 1-Charged Off, 2-Fully Paid 
##   14  purpose               factor   .    (14): 1-car, 2-credit_card,      
##                                           3-debt_consolidation,            
##                                           4-educational,                   
##                                           5-home_improvement, ...          
##   15  title                 factor   .    (48818): 1-, 2- credit_card, 3-  
##                                           debt_consolidation, 4- other, 5- 
##                                           small_business, ...              
##   16  dti                   numeric  .                                     
##   17  earliest_cr_line      factor   .    (684): 1-Apr-1955, 2-Apr-1958,   
##                                           3-Apr-1960, 4-Apr-1961,          
##                                           5-Apr-1962, ...                  
##   18  open_acc              numeric  .                                     
##   19  pub_rec               numeric  .                                     
##   20  revol_bal             numeric  .                                     
##   21  revol_util            numeric  .                                     
##   22  total_acc             numeric  .                                     
##   23  initial_list_status   factor   .    (2): 1-f, 2-w                    
##   24  application_type      factor   .    (3): 1-DIRECT_PAY, 2-INDIVIDUAL, 
##                                           3-JOINT                          
##   25  mort_acc              numeric  .                                     
##   26  pub_rec_bankruptcies  numeric  .                                     
##   27  address               factor   .    (393700): 1-000 Adam Station Apt.
##                                           329 Ashleyberg, AZ 22690, 2-000  
##                                           Adrian Cliffs Randyton, LA 22690,
##                                           3-000 Alexandria Street Port     
##                                           Richard, FL 22690, 4-000 Amber   
##                                           Court Lake Pamelatown, IN 00813, 
##                                           5-000 Amy Pines Suite 498 South  
##                                           Susan, ND 22690, ...             
## 
## 
## ------------------------------------------------------------------------------ 
## 1 - loan_amnt (numeric)
## 
##      length         n       NAs     unique         0s       mean     meanCI'
##     358'014   358'014         0      1'390          0  14'386.90  14'359.43
##                100.0%      0.0%                  0.0%             14'414.37
##                                                                            
##         .05       .10       .25     median        .75        .90        .95
##    3'500.00  5'000.00  8'000.00  12'000.00  20'000.00  27'000.00  31'825.00
##                                                                            
##       range        sd     vcoef        mad        IQR       skew       kurt
##   39'000.00  8'385.10      0.58   8'154.30  12'000.00       0.75      -0.12
##                                                                            
## lowest : 1'000.0 (1'155), 1'025.0 (4), 1'050.0 (6), 1'075.0 (5), 1'100.0 (34)
## highest: 39'475.0, 39'500.0, 39'600.0, 39'700.0, 40'000.0 (179)
## 
## heap(?): remarkable frequency (6.9%) for the mode(s) (= 10000)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 2 - term (factor - dichotomous)
## 
##    length       n     NAs  unique
##   358'014 358'014       0       2
##            100.0%    0.0%        
## 
##                freq   perc  lci.95  uci.95'
##  36 months  273'605  76.4%   76.3%   76.6%
##  60 months   84'409  23.6%   23.4%   23.7%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 3 - int_rate (numeric)
## 
##    length        n    NAs  unique     0s   mean  meanCI'
##   358'014  358'014      0     265      0  13.80   13.79
##             100.0%   0.0%           0.0%          13.82
##                                                        
##       .05      .10    .25  median    .75    .90     .95
##      6.92     7.90  10.74   13.44  16.78  19.72   21.99
##                                                        
##     range       sd  vcoef     mad    IQR   skew    kurt
##     25.67     4.50   0.33    4.61   6.04   0.41   -0.17
##                                                        
## lowest : 5.32 (2'440), 5.93 (431), 6.0 (56), 6.03 (5'675), 6.24 (1'184)
## highest: 30.79 (9), 30.84, 30.89 (3), 30.94 (3), 30.99 (13)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 4 - installment (numeric)
## 
##      length        n      NAs   unique       0s     mean   meanCI'
##     358'014  358'014        0   50'208        0  441.818  440.993
##               100.0%     0.0%              0.0%           442.643
##                                                                  
##         .05      .10      .25   median      .75      .90      .95
##     118.693  166.050  260.460  385.120  580.450  794.650  939.004
##                                                                  
##       range       sd    vcoef      mad      IQR     skew     kurt
##   1'512.190  251.912    0.570  223.257  319.990    0.969    0.733
##                                                                  
## lowest : 21.62, 23.61, 28.75, 28.82, 29.52
## highest: 1'464.420, 1'479.490, 1'503.850, 1'527.0, 1'533.810
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 5 - grade (factor)
## 
##    length       n     NAs  unique  levels   dupes
##   358'014 358'014       0       7       7       y
##            100.0%    0.0%                        
## 
##    level     freq   perc  cumfreq  cumperc
## 1      B  104'416  29.2%  104'416    29.2%
## 2      C   98'353  27.5%  202'769    56.6%
## 3      D   58'558  16.4%  261'327    73.0%
## 4      A   54'255  15.2%  315'582    88.1%
## 5      E   28'871   8.1%  344'453    96.2%
## 6      F   10'792   3.0%  355'245    99.2%
## 7      G    2'769   0.8%  358'014   100.0%

## ------------------------------------------------------------------------------ 
## 6 - sub_grade (factor)
## 
##    length       n     NAs  unique  levels   dupes
##   358'014 358'014       0      35      35       y
##            100.0%    0.0%                        
## 
##     level    freq  perc  cumfreq  cumperc
## 1      B3  23'768  6.6%   23'768     6.6%
## 2      B4  23'219  6.5%   46'987    13.1%
## 3      C1  21'612  6.0%   68'599    19.2%
## 4      C2  20'617  5.8%   89'216    24.9%
## 5      B2  20'491  5.7%  109'707    30.6%
## 6      C3  19'840  5.5%  129'547    36.2%
## 7      B5  19'557  5.5%  149'104    41.6%
## 8      C4  19'143  5.3%  168'247    47.0%
## 9      B1  17'381  4.9%  185'628    51.8%
## 10     C5  17'141  4.8%  202'769    56.6%
## 11     A5  15'925  4.4%  218'694    61.1%
## 12     D1  15'081  4.2%  233'775    65.3%
## ... etc.
##  [list output truncated]

## ------------------------------------------------------------------------------ 
## 7 - emp_title (factor)
## 
##    length       n     NAs  unique  levels   dupes
##   358'014 358'014       0 151'302 173'106       y
##            100.0%    0.0%                        
## 
##                level    freq  perc  cumfreq  cumperc
## 1                     20'486  5.7%   20'486     5.7%
## 2            Teacher   4'387  1.2%   24'873     6.9%
## 3            Manager   4'249  1.2%   29'122     8.1%
## 4   Registered Nurse   1'855  0.5%   30'977     8.7%
## 5                 RN   1'844  0.5%   32'821     9.2%
## 6         Supervisor   1'830  0.5%   34'651     9.7%
## 7              Sales   1'636  0.5%   36'287    10.1%
## 8    Project Manager   1'503  0.4%   37'790    10.6%
## 9              Owner   1'410  0.4%   39'200    10.9%
## 10            Driver   1'337  0.4%   40'537    11.3%
## 11    Office Manager   1'217  0.3%   41'754    11.7%
## 12           manager   1'145  0.3%   42'899    12.0%
## ... etc.
##  [list output truncated]

## ------------------------------------------------------------------------------ 
## 8 - emp_length (factor)
## 
##    length       n     NAs  unique  levels   dupes
##   358'014 358'014       0      12      12       y
##            100.0%    0.0%                        
## 
##         level     freq   perc  cumfreq  cumperc
## 1   10+ years  117'323  32.8%  117'323    32.8%
## 2     2 years   31'720   8.9%  149'043    41.6%
## 3     3 years   27'866   7.8%  176'909    49.4%
## 4    < 1 year   27'538   7.7%  204'447    57.1%
## 5     5 years   23'345   6.5%  227'792    63.6%
## 6      1 year   22'841   6.4%  250'633    70.0%
## 7     4 years   20'656   5.8%  271'289    75.8%
## 8     7 years   19'038   5.3%  290'327    81.1%
## 9     6 years   18'629   5.2%  308'956    86.3%
## 10    8 years   17'735   5.0%  326'691    91.3%
## 11              17'239   4.8%  343'930    96.1%
## 12    9 years   14'084   3.9%  358'014   100.0%

## ------------------------------------------------------------------------------ 
## 9 - home_ownership (factor)
## 
##    length       n     NAs  unique  levels   dupes
##   358'014 358'014       0       6       6       y
##            100.0%    0.0%                        
## 
##       level     freq   perc  cumfreq  cumperc
## 1  MORTGAGE  181'592  50.7%  181'592    50.7%
## 2      RENT  141'604  39.6%  323'196    90.3%
## 3       OWN   34'752   9.7%  357'948   100.0%
## 4     OTHER       34   0.0%  357'982   100.0%
## 5      NONE       29   0.0%  358'011   100.0%
## 6       ANY        3   0.0%  358'014   100.0%

## ------------------------------------------------------------------------------ 
## 10 - annual_inc (numeric)
## 
##         length          n        NAs     unique         0s        mean'
##        358'014    358'014          0     24'723          1   74'746.46
##                    100.0%       0.0%                  0.0%            
##                                                                       
##            .05        .10        .25     median        .75         .90
##      28'800.00  35'000.00  45'400.75  65'000.00  90'000.00  122'000.00
##                                                                       
##          range         sd      vcoef        mad        IQR        skew
##   8'706'582.00  61'407.27       0.82  29'652.00  44'599.25       42.14
##                                                                       
##       meanCI
##    74'545.31
##    74'947.61
##             
##          .95
##   150'000.00
##             
##         kurt
##     4'462.80
##             
## lowest : 0.0, 600.0, 2'500.0, 4'000.0, 4'524.0
## highest: 7'000'000.0, 7'141'778.0, 7'446'395.0, 7'600'000.0, 8'706'582.0
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 11 - verification_status (factor)
## 
##    length       n     NAs  unique  levels   dupes
##   358'014 358'014       0       3       3       y
##            100.0%    0.0%                        
## 
##              level     freq   perc  cumfreq  cumperc
## 1         Verified  127'154  35.5%  127'154    35.5%
## 2  Source Verified  121'220  33.9%  248'374    69.4%
## 3     Not Verified  109'640  30.6%  358'014   100.0%

## ------------------------------------------------------------------------------ 
## 12 - issue_d (factor)
## 
##    length       n     NAs  unique  levels   dupes
##   358'014 358'014       0      58     115       y
##            100.0%    0.0%                        
## 
##        level    freq  perc  cumfreq  cumperc
## 1   Oct-2014  14'838  4.1%   14'838     4.1%
## 2   Jul-2014  12'597  3.5%   27'435     7.7%
## 3   Jan-2015  11'701  3.3%   39'136    10.9%
## 4   Dec-2013  10'609  3.0%   49'745    13.9%
## 5   Nov-2013  10'492  2.9%   60'237    16.8%
## 6   Jul-2015  10'260  2.9%   70'497    19.7%
## 7   Oct-2013  10'040  2.8%   80'537    22.5%
## 8   Jan-2014   9'702  2.7%   90'239    25.2%
## 9   Apr-2015   9'466  2.6%   99'705    27.8%
## 10  Sep-2013   9'172  2.6%  108'877    30.4%
## 11  Aug-2013   9'100  2.5%  117'977    33.0%
## 12  Apr-2014   9'012  2.5%  126'989    35.5%
## ... etc.
##  [list output truncated]

## ------------------------------------------------------------------------------ 
## 13 - loan_status (factor - dichotomous)
## 
##    length       n     NAs  unique
##   358'014 358'014       0       2
##            100.0%    0.0%        
## 
##                 freq   perc  lci.95  uci.95'
## Charged Off   72'078  20.1%   20.0%   20.3%
## Fully Paid   285'936  79.9%   79.7%   80.0%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 14 - purpose (factor)
## 
##    length       n     NAs  unique  levels   dupes
##   358'014 358'014       0      14      14       y
##            100.0%    0.0%                        
## 
##                  level     freq   perc  cumfreq  cumperc
## 1   debt_consolidation  216'366  60.4%  216'366    60.4%
## 2          credit_card   77'681  21.7%  294'047    82.1%
## 3     home_improvement   21'327   6.0%  315'374    88.1%
## 4                other   17'542   4.9%  332'916    93.0%
## 5       major_purchase    6'838   1.9%  339'754    94.9%
## 6       small_business    3'939   1.1%  343'693    96.0%
## 7              medical    3'559   1.0%  347'252    97.0%
## 8                  car    3'282   0.9%  350'534    97.9%
## 9               moving    2'343   0.7%  352'877    98.6%
## 10            vacation    2'120   0.6%  354'997    99.2%
## 11               house    1'819   0.5%  356'816    99.7%
## 12             wedding      956   0.3%  357'772    99.9%
## ... etc.
##  [list output truncated]

## ------------------------------------------------------------------------------ 
## 15 - title (factor)
## 
##    length       n     NAs  unique  levels   dupes
##   358'014 358'014       0  34'276  48'818       y
##            100.0%    0.0%                        
## 
##                       level     freq   perc  cumfreq  cumperc
## 1        Debt consolidation  152'037  42.5%  152'037    42.5%
## 2   Credit card refinancing   51'470  14.4%  203'507    56.8%
## 3          Home improvement   15'185   4.2%  218'692    61.1%
## 4                     Other   12'841   3.6%  231'533    64.7%
## 5        Debt Consolidation    9'292   2.6%  240'825    67.3%
## 6            Major purchase    4'751   1.3%  245'576    68.6%
## 7             Consolidation    3'338   0.9%  248'914    69.5%
## 8        debt consolidation    3'024   0.8%  251'938    70.4%
## 9                  Business    2'906   0.8%  254'844    71.2%
## 10         Medical expenses    2'729   0.8%  257'573    71.9%
## 11            Car financing    2'134   0.6%  259'707    72.5%
## 12                             1'744   0.5%  261'451    73.0%
## ... etc.
##  [list output truncated]

## ------------------------------------------------------------------------------ 
## 16 - dti (numeric)
## 
##     length        n    NAs  unique     0s    mean      meanCI'
##    358'014  358'014      0   4'262    139   17.79       17.73
##              100.0%   0.0%           0.0%               17.85
##                                                              
##        .05      .10    .25  median    .75     .90         .95
##       5.04     7.26  11.63   17.29  23.49   28.96       31.97
##                                                              
##      range       sd  vcoef     mad    IQR    skew        kurt
##   9'999.00    18.78   1.06    8.76  11.86  421.09  222'991.82
##                                                              
## lowest : 0.0 (139), 0.01 (5), 0.02 (7), 0.03 (3), 0.04 (3)
## highest: 145.65, 189.9, 380.53, 1'622.0, 9'999.0
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 17 - earliest_cr_line (factor)
## 
##    length       n     NAs  unique  levels   dupes
##   358'014 358'014       0     680     684       y
##            100.0%    0.0%                        
## 
##        level   freq  perc  cumfreq  cumperc
## 1   Aug-2001  2'680  0.7%    2'680     0.7%
## 2   Oct-2000  2'678  0.7%    5'358     1.5%
## 3   Aug-2000  2'671  0.7%    8'029     2.2%
## 4   Oct-2001  2'630  0.7%   10'659     3.0%
## 5   Aug-2002  2'437  0.7%   13'096     3.7%
## 6   Sep-2000  2'416  0.7%   15'512     4.3%
## 7   Nov-2000  2'412  0.7%   17'924     5.0%
## 8   Oct-1999  2'392  0.7%   20'316     5.7%
## 9   Oct-2002  2'392  0.7%   22'708     6.3%
## 10  Nov-1999  2'388  0.7%   25'096     7.0%
## 11  Aug-1999  2'314  0.6%   27'410     7.7%
## 12  Sep-2002  2'308  0.6%   29'718     8.3%
## ... etc.
##  [list output truncated]

## ------------------------------------------------------------------------------ 
## 18 - open_acc (numeric)
## 
##    length        n    NAs  unique     0s   mean  meanCI'
##   358'014  358'014      0      60      0  11.52   11.50
##             100.0%   0.0%           0.0%          11.54
##                                                        
##       .05      .10    .25  median    .75    .90     .95
##      5.00     6.00   8.00   11.00  14.00  18.00   21.00
##                                                        
##     range       sd  vcoef     mad    IQR   skew    kurt
##     89.00     5.17   0.45    4.45   6.00   1.23    3.01
##                                                        
## lowest : 1.0 (71), 2.0 (879), 3.0 (3'410), 4.0 (8'559), 5.0 (15'383)
## highest: 56.0 (2), 57.0, 58.0, 76.0 (2), 90.0
## 
## heap(?): remarkable frequency (9.3%) for the mode(s) (= 9)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 19 - pub_rec (numeric)
## 
##    length        n    NAs  unique       0s   mean    meanCI'
##   358'014  358'014      0      20  302'144   0.19      0.19
##             100.0%   0.0%            84.4%             0.19
##                                                            
##       .05      .10    .25  median      .75    .90       .95
##      0.00     0.00   0.00    0.00     0.00   1.00      1.00
##                                                            
##     range       sd  vcoef     mad      IQR   skew      kurt
##     86.00     0.55   2.88    0.00     0.00  16.25  1'770.23
##                                                            
## lowest : 0.0 (302'144), 1.0 (47'904), 2.0 (5'431), 3.0 (1'514), 4.0 (526)
## highest: 17.0, 19.0 (2), 24.0, 40.0, 86.0
## 
## heap(?): remarkable frequency (84.4%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 20 - revol_bal (numeric)
## 
##         length          n       NAs     unique         0s       mean     meanCI'
##        358'014    358'014         0     54'104      1'091  16'088.61  16'019.73
##                    100.0%      0.0%                  0.3%             16'157.50
##                                                                                
##            .05        .10       .25     median        .75        .90        .95
##       1'961.00   3'336.00  6'222.00  11'363.00  19'852.00  31'685.70  41'119.35
##                                                                                
##          range         sd     vcoef        mad        IQR       skew       kurt
##   1'743'266.00  21'029.89      1.31   9'052.76  13'630.00      12.02     389.58
##                                                                                
## lowest : 0.0 (1'091), 1.0 (19), 2.0 (21), 3.0 (21), 4.0 (17)
## highest: 1'023'940.0, 1'030'826.0, 1'190'046.0, 1'298'783.0, 1'743'266.0
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 21 - revol_util (numeric)
## 
##    length        n    NAs  unique     0s   mean  meanCI'
##   358'014  358'014      0   1'160  1'347  54.19   54.11
##             100.0%   0.0%           0.4%          54.27
##                                                        
##       .05      .10    .25  median    .75    .90     .95
##     12.70    21.00  36.60   55.10  72.80  86.00   91.90
##                                                        
##     range       sd  vcoef     mad    IQR   skew    kurt
##    892.30    24.00   0.44   26.84  36.20  -0.05    3.39
##                                                        
## lowest : 0.0 (1'347), 0.1 (197), 0.2 (161), 0.3 (146), 0.4 (148)
## highest: 148.0, 150.7, 152.5, 153.0, 892.3
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 22 - total_acc (numeric)
## 
##    length        n    NAs  unique     0s   mean  meanCI'
##   358'014  358'014      0     118      0  25.77   25.73
##             100.0%   0.0%           0.0%          25.81
##                                                        
##       .05      .10    .25  median    .75    .90     .95
##      9.00    12.00  17.00   24.00  33.00  42.00   48.00
##                                                        
##     range       sd  vcoef     mad    IQR   skew    kurt
##    149.00    11.90   0.46   11.86  16.00   0.87    1.25
##                                                        
## lowest : 2.0 (16), 3.0 (172), 4.0 (869), 5.0 (1'518), 6.0 (2'316)
## highest: 124.0, 129.0, 135.0, 150.0, 151.0
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 23 - initial_list_status (factor - dichotomous)
## 
##    length       n     NAs  unique
##   358'014 358'014       0       2
##            100.0%    0.0%        
## 
##       freq   perc  lci.95  uci.95'
## f  200'141  55.9%   55.7%   56.1%
## w  157'873  44.1%   43.9%   44.3%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 24 - application_type (factor)
## 
##    length       n     NAs  unique  levels   dupes
##   358'014 358'014       0       3       3       y
##            100.0%    0.0%                        
## 
##         level     freq   perc  cumfreq  cumperc
## 1  INDIVIDUAL  357'303  99.8%  357'303    99.8%
## 2       JOINT      425   0.1%  357'728    99.9%
## 3  DIRECT_PAY      286   0.1%  358'014   100.0%

## ------------------------------------------------------------------------------ 
## 25 - mort_acc (numeric)
## 
##    length        n    NAs  unique       0s  mean  meanCI'
##   358'014  358'014      0      33  139'708  1.81    1.81
##             100.0%   0.0%            39.0%          1.82
##                                                         
##       .05      .10    .25  median      .75   .90     .95
##      0.00     0.00   0.00    1.00     3.00  5.00    6.00
##                                                         
##     range       sd  vcoef     mad      IQR  skew    kurt
##     34.00     2.15   1.18    1.48     3.00  1.60    4.48
##                                                         
## lowest : 0.0 (139'708), 1.0 (60'384), 2.0 (49'907), 3.0 (38'022), 4.0 (27'865)
## highest: 28.0, 30.0, 31.0 (2), 32.0 (2), 34.0
## 
## heap(?): remarkable frequency (39.0%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 26 - pub_rec_bankruptcies (numeric)
## 
##    length        n    NAs  unique       0s  mean  meanCI'
##   358'014  358'014      0       9  314'399  0.13    0.13
##             100.0%   0.0%            87.8%          0.13
##                                                         
##       .05      .10    .25  median      .75   .90     .95
##      0.00     0.00   0.00    0.00     0.00  1.00    1.00
##                                                         
##     range       sd  vcoef     mad      IQR  skew    kurt
##      8.00     0.37   2.83    0.00     0.00  3.31   17.04
##                                                         
## 
##    level     freq   perc  cumfreq  cumperc
## 1      0  314'399  87.8%  314'399    87.8%
## 2      1   41'297  11.5%  355'696    99.4%
## 3      2    1'840   0.5%  357'536    99.9%
## 4      3      351   0.1%  357'887   100.0%
## 5      4       82   0.0%  357'969   100.0%
## 6      5       32   0.0%  358'001   100.0%
## 7      6        7   0.0%  358'008   100.0%
## 8      7        4   0.0%  358'012   100.0%
## 9      8        2   0.0%  358'014   100.0%
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 27 - address (factor)
## 
##    length       n     NAs  unique  levels   dupes
##   358'014 358'014       0 356'061 393'700       y
##            100.0%    0.0%                        
## 
##                          level  freq  perc  cumfreq  cumperc
## 1   USNS Johnson\nFPO AE 05113     8  0.0%        8     0.0%
## 2    USS Johnson\nFPO AE 48052     7  0.0%       15     0.0%
## 3      USS Smith\nFPO AP 70466     7  0.0%       22     0.0%
## 4    USCGC Jones\nFPO AE 22690     6  0.0%       28     0.0%
## 5   USCGC Miller\nFPO AA 22690     6  0.0%       34     0.0%
## 6    USCGC Smith\nFPO AA 70466     6  0.0%       40     0.0%
## 7   USNS Johnson\nFPO AA 70466     6  0.0%       46     0.0%
## 8   USNS Johnson\nFPO AP 48052     6  0.0%       52     0.0%
## 9     USNV Brown\nFPO AA 48052     6  0.0%       58     0.0%
## 10    USNV Smith\nFPO AA 00813     6  0.0%       64     0.0%
## 11     USS Smith\nFPO AP 22690     6  0.0%       70     0.0%
## 12   USCGC Brown\nFPO AA 30723     5  0.0%       75     0.0%
## ... etc.
##  [list output truncated]

# Base R summary of every column (quantiles for numerics, counts for factors).
summary(object = lend)
##    loan_amnt             term           int_rate      installment     
##  Min.   : 1000    36 months:273605   Min.   : 5.32   Min.   :  21.62  
##  1st Qu.: 8000    60 months: 84409   1st Qu.:10.74   1st Qu.: 260.46  
##  Median :12000                       Median :13.44   Median : 385.12  
##  Mean   :14387                       Mean   :13.80   Mean   : 441.82  
##  3rd Qu.:20000                       3rd Qu.:16.78   3rd Qu.: 580.45  
##  Max.   :40000                       Max.   :30.99   Max.   :1533.81  
##                                                                       
##  grade        sub_grade                 emp_title          emp_length    
##  A: 54255   B3     : 23768                   : 20486   10+ years:117323  
##  B:104416   B4     : 23219   Teacher         :  4387   2 years  : 31720  
##  C: 98353   C1     : 21612   Manager         :  4249   3 years  : 27866  
##  D: 58558   C2     : 20617   Registered Nurse:  1855   < 1 year : 27538  
##  E: 28871   B2     : 20491   RN              :  1844   5 years  : 23345  
##  F: 10792   C3     : 19840   Supervisor      :  1830   1 year   : 22841  
##  G:  2769   (Other):228467   (Other)         :323363   (Other)  :107381  
##   home_ownership     annual_inc           verification_status     issue_d      
##  ANY     :     3   Min.   :      0   Not Verified   :109640   Oct-2014: 14838  
##  MORTGAGE:181592   1st Qu.:  45401   Source Verified:121220   Jul-2014: 12597  
##  NONE    :    29   Median :  65000   Verified       :127154   Jan-2015: 11701  
##  OTHER   :    34   Mean   :  74746                            Dec-2013: 10609  
##  OWN     : 34752   3rd Qu.:  90000                            Nov-2013: 10492  
##  RENT    :141604   Max.   :8706582                            Jul-2015: 10260  
##                                                               (Other) :287517  
##       loan_status                   purpose      
##  Charged Off: 72078   debt_consolidation:216366  
##  Fully Paid :285936   credit_card       : 77681  
##                       home_improvement  : 21327  
##                       other             : 17542  
##                       major_purchase    :  6838  
##                       small_business    :  3939  
##                       (Other)           : 14321  
##                      title             dti          earliest_cr_line 
##  Debt consolidation     :152037   Min.   :   0.00   Aug-2001:  2680  
##  Credit card refinancing: 51470   1st Qu.:  11.63   Oct-2000:  2678  
##  Home improvement       : 15185   Median :  17.29   Aug-2000:  2671  
##  Other                  : 12841   Mean   :  17.79   Oct-2001:  2630  
##  Debt Consolidation     :  9292   3rd Qu.:  23.49   Aug-2002:  2437  
##  Major purchase         :  4751   Max.   :9999.00   Sep-2000:  2416  
##  (Other)                :112438                     (Other) :342502  
##     open_acc        pub_rec          revol_bal         revol_util    
##  Min.   : 1.00   Min.   : 0.0000   Min.   :      0   Min.   :  0.00  
##  1st Qu.: 8.00   1st Qu.: 0.0000   1st Qu.:   6222   1st Qu.: 36.60  
##  Median :11.00   Median : 0.0000   Median :  11363   Median : 55.10  
##  Mean   :11.52   Mean   : 0.1917   Mean   :  16089   Mean   : 54.19  
##  3rd Qu.:14.00   3rd Qu.: 0.0000   3rd Qu.:  19852   3rd Qu.: 72.80  
##  Max.   :90.00   Max.   :86.0000   Max.   :1743266   Max.   :892.30  
##                                                                      
##    total_acc      initial_list_status   application_type     mort_acc     
##  Min.   :  2.00   f:200141            DIRECT_PAY:   286   Min.   : 0.000  
##  1st Qu.: 17.00   w:157873            INDIVIDUAL:357303   1st Qu.: 0.000  
##  Median : 24.00                       JOINT     :   425   Median : 1.000  
##  Mean   : 25.77                                           Mean   : 1.814  
##  3rd Qu.: 33.00                                           3rd Qu.: 3.000  
##  Max.   :151.00                                           Max.   :34.000  
##                                                                           
##  pub_rec_bankruptcies                      address       
##  Min.   :0.0000       USNS Johnson\nFPO AE 05113:     8  
##  1st Qu.:0.0000       USS Johnson\nFPO AE 48052 :     7  
##  Median :0.0000       USS Smith\nFPO AP 70466   :     7  
##  Mean   :0.1302       USCGC Jones\nFPO AE 22690 :     6  
##  3rd Qu.:0.0000       USCGC Miller\nFPO AA 22690:     6  
##  Max.   :8.0000       USCGC Smith\nFPO AA 70466 :     6  
##                       (Other)                   :357974
library(pastecs)
## 
## Attaching package: 'pastecs'
## The following object is masked from 'package:tidyr':
## 
##     extract
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:pastecs':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the twelve numeric columns of `lend`, then print pastecs
# descriptive statistics (counts, range, mean, variance, ...) for each one.
lendsub <- lend %>%
  select(
    loan_amnt, int_rate, installment, annual_inc,
    dti, open_acc, pub_rec, revol_bal, revol_util, total_acc,
    mort_acc, pub_rec_bankruptcies
  )
stat.desc(lendsub)
##                 loan_amnt     int_rate  installment   annual_inc          dti
## nbr.val      3.580140e+05 3.580140e+05 3.580140e+05 3.580140e+05 3.580140e+05
## nbr.null     0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 1.390000e+02
## nbr.na       0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## min          1.000000e+03 5.320000e+00 2.162000e+01 0.000000e+00 0.000000e+00
## max          4.000000e+04 3.099000e+01 1.533810e+03 8.706582e+06 9.999000e+03
## range        3.900000e+04 2.567000e+01 1.512190e+03 8.706582e+06 9.999000e+03
## sum          5.150712e+09 4.942115e+06 1.581771e+08 2.676028e+10 6.368604e+06
## median       1.200000e+04 1.344000e+01 3.851200e+02 6.500000e+04 1.729000e+01
## mean         1.438690e+04 1.380425e+01 4.418181e+02 7.474646e+04 1.778870e+01
## SE.mean      1.401388e+01 7.528314e-03 4.210157e-01 1.026289e+02 3.138754e-02
## CI.mean.0.95 2.746679e+01 1.475527e-02 8.251784e-01 2.011497e+02 6.151865e-02
## var          7.030995e+07 2.029063e+01 6.345949e+04 3.770853e+09 3.527073e+02
## std.dev      8.385103e+03 4.504512e+00 2.519117e+02 6.140727e+04 1.878050e+01
## coef.var     5.828289e-01 3.263134e-01 5.701705e-01 8.215409e-01 1.055755e+00
##                  open_acc      pub_rec    revol_bal   revol_util    total_acc
## nbr.val      3.580140e+05 3.580140e+05 3.580140e+05 3.580140e+05 3.580140e+05
## nbr.null     0.000000e+00 3.021440e+05 1.091000e+03 1.347000e+03 0.000000e+00
## nbr.na       0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## min          1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 2.000000e+00
## max          9.000000e+01 8.600000e+01 1.743266e+06 8.923000e+02 1.510000e+02
## range        8.900000e+01 8.600000e+01 1.743266e+06 8.923000e+02 1.490000e+02
## sum          4.124980e+06 6.861900e+04 5.759949e+09 1.940195e+07 9.225534e+06
## median       1.100000e+01 0.000000e+00 1.136300e+04 5.510000e+01 2.400000e+01
## mean         1.152184e+01 1.916657e-01 1.608861e+04 5.419326e+01 2.576864e+01
## SE.mean      8.639367e-03 9.215629e-04 3.514689e+01 4.010819e-02 1.988711e-02
## CI.mean.0.95 1.693291e-02 1.806236e-03 6.888688e+01 7.861087e-02 3.897815e-02
## var          2.672169e+01 3.040535e-01 4.422561e+08 5.759252e+02 1.415935e+02
## std.dev      5.169302e+00 5.514104e-01 2.102989e+04 2.399844e+01 1.189931e+01
## coef.var     4.486525e-01 2.876939e+00 1.307129e+00 4.428307e-01 4.617747e-01
##                  mort_acc pub_rec_bankruptcies
## nbr.val      3.580140e+05         3.580140e+05
## nbr.null     1.397080e+05         3.143990e+05
## nbr.na       0.000000e+00         0.000000e+00
## min          0.000000e+00         0.000000e+00
## max          3.400000e+01         8.000000e+00
## range        3.400000e+01         8.000000e+00
## sum          6.493630e+05         4.660400e+04
## median       1.000000e+00         0.000000e+00
## mean         1.813792e+00         1.301737e-01
## SE.mean      3.589744e-03         6.147393e-04
## CI.mean.0.95 7.035793e-03         1.204871e-03
## var          4.613462e+00         1.352950e-01
## std.dev      2.147897e+00         3.678248e-01
## coef.var     1.184202e+00         2.825646e+00
library(ggplot2)
# NOTE(review): `lend` already went through drop_na() right after loading, so
# this call looks redundant; it is kept so the block's effect on `lend` is
# unchanged.
lend <- drop_na(lend)
# Histogram of loan amounts in bins of width 1000. Built with ggplot() +
# geom_histogram() rather than qplot(), which is deprecated in ggplot2 >= 3.4.0;
# title, axis label, fill and outline colour are the same as before.
ggplot(lend, aes(x = loan_amnt)) +
  geom_histogram(binwidth = 1000, fill = "lightblue", colour = "black") +
  labs(title = "Histogram for loan Amount", x = "Loan Amount")

# Draw four base-graphics panels on a 2 x 2 grid with 2-line margins.
# par() returns the previous settings; they are restored at the end so that
# later base plots are not squeezed into this layout and these margins.
old_par <- par(mfrow = c(2, 2), mar = rep(2, 4))
hist(lend$int_rate, ylim = c(0, 40000), col = "lightblue", main = "Interest Rate")
hist(lend$installment, ylim = c(0, 80000), col = "lightblue", main = "Installment")
# barplot(table(x)) is what plot() draws for a factor, but it also works when
# the column is a character vector (the read.csv() default from R 4.0 on).
barplot(table(lend$term), ylim = c(0, 350000), col = "lightblue", main = "Loan Term")
barplot(table(lend$loan_status), ylim = c(0, 350000), col = "lightblue",
        main = "Loan Status")
par(old_par)

# Box plot of loan_amnt for each level of grade; outliers use point shape 1.
ggplot(data = lend, mapping = aes(x = grade, y = loan_amnt)) +
  geom_boxplot(
    fill = "steelblue3",
    colour = "black",
    outlier.colour = "black",
    outlier.shape = 1
  ) +
  ggtitle("Loan Amount by Grade") +
  xlab("Grade") +
  ylab("Loan Amount \n")

# Box plot of int_rate for each level of grade; outliers use point shape 1.
ggplot(data = lend, mapping = aes(x = grade, y = int_rate)) +
  geom_boxplot(
    fill = "steelblue3",
    colour = "black",
    outlier.colour = "black",
    outlier.shape = 1
  ) +
  ggtitle("Interest Rate by Grade") +
  xlab("Grade") +
  ylab("Interest Rate \n")

# Box plot of int_rate for each level of home_ownership; outliers use point
# shape 1.
ggplot(data = lend, mapping = aes(x = home_ownership, y = int_rate)) +
  geom_boxplot(
    fill = "steelblue3",
    colour = "black",
    outlier.colour = "black",
    outlier.shape = 1
  ) +
  ggtitle("Interest Rate by Home Ownership") +
  xlab("Home Ownership") +
  ylab("Interest Rate \n")

# Box plot of loan_amnt for each level of term; outliers use point shape 1.
ggplot(data = lend, mapping = aes(x = term, y = loan_amnt)) +
  geom_boxplot(
    fill = "steelblue3",
    colour = "black",
    outlier.colour = "black",
    outlier.shape = 1
  ) +
  ggtitle("Loan Amount by Term") +
  xlab("Term") +
  ylab("Loan Amount \n")

# Count the rows of `lend` in each `purpose` category.
table(lend$purpose)
## 
##                car        credit_card debt_consolidation        educational 
##               3282              77681             216366                  1 
##   home_improvement              house     major_purchase            medical 
##              21327               1819               6838               3559 
##             moving              other   renewable_energy     small_business 
##               2343              17542                241               3939 
##           vacation            wedding 
##               2120                956
# Pie chart of loan purposes. The slice sizes are taken from the current
# `lend` data rather than typed in by hand: the previous hard-coded counts did
# not match table(lend$purpose) for the NA-free data set analysed here.
purpose_counts <- table(lend$purpose)
info <- as.vector(purpose_counts)
# Named `purpose_labels` (not `names`) so base::names() is not shadowed.
purpose_labels <- names(purpose_counts)
# One shared palette for slices and legend; without `col`, pie() falls back to
# its own default palette and the rainbow() legend would not match the slices.
slice_cols <- rainbow(length(info))
pie(info, labels = purpose_labels, col = slice_cols, main = "Purpose")
legend("topright", legend = purpose_labels, cex = 0.5, fill = slice_cols)

library(ggcorrplot)
# Pairwise correlations between the numeric columns of `lend`, computed on
# complete rows only and printed to two decimals.
df <- lend %>%
  dplyr::select_if(is.numeric)
r <- cor(df, use = "complete.obs")
round(r, digits = 2)
##                      loan_amnt int_rate installment annual_inc   dti open_acc
## loan_amnt                 1.00     0.15        0.96       0.34  0.01     0.19
## int_rate                  0.15     1.00        0.14      -0.07  0.07     0.00
## installment               0.96     0.14        1.00       0.34  0.01     0.18
## annual_inc                0.34    -0.07        0.34       1.00 -0.08     0.13
## dti                       0.01     0.07        0.01      -0.08  1.00     0.13
## open_acc                  0.19     0.00        0.18       0.13  0.13     1.00
## pub_rec                  -0.09     0.05       -0.08      -0.02 -0.02    -0.03
## revol_bal                 0.33    -0.02        0.31       0.30  0.06     0.21
## revol_util                0.10     0.27        0.12       0.03  0.08    -0.14
## total_acc                 0.21    -0.05        0.19       0.19  0.09     0.68
## mort_acc                  0.22    -0.08        0.19       0.24 -0.03     0.11
## pub_rec_bankruptcies     -0.12     0.05       -0.11      -0.06 -0.02    -0.04
##                      pub_rec revol_bal revol_util total_acc mort_acc
## loan_amnt              -0.09      0.33       0.10      0.21     0.22
## int_rate                0.05     -0.02       0.27     -0.05    -0.08
## installment            -0.08      0.31       0.12      0.19     0.19
## annual_inc             -0.02      0.30       0.03      0.19     0.24
## dti                    -0.02      0.06       0.08      0.09    -0.03
## open_acc               -0.03      0.21      -0.14      0.68     0.11
## pub_rec                 1.00     -0.11      -0.09      0.01     0.01
## revol_bal              -0.11      1.00       0.22      0.18     0.20
## revol_util             -0.09      0.22       1.00     -0.11     0.01
## total_acc               0.01      0.18      -0.11      1.00     0.38
## mort_acc                0.01      0.20       0.01      0.38     1.00
## pub_rec_bankruptcies    0.69     -0.13      -0.10      0.04     0.03
##                      pub_rec_bankruptcies
## loan_amnt                           -0.12
## int_rate                             0.05
## installment                         -0.11
## annual_inc                          -0.06
## dti                                 -0.02
## open_acc                            -0.04
## pub_rec                              0.69
## revol_bal                           -0.13
## revol_util                          -0.10
## total_acc                            0.04
## mort_acc                             0.03
## pub_rec_bankruptcies                 1.00
# Plot the lower triangle of the correlation matrix `r`, with hc.order = TRUE
# and the coefficient printed in each cell.
ggcorrplot(
  r,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE
)

# Modelling table: loan_status plus six numeric columns of `lend`.
input <- lend[
  ,
  c(
    "loan_status", "loan_amnt", "annual_inc", "int_rate",
    "installment", "revol_bal", "revol_util"
  )
]
dim(input)
## [1] 358014      7
# Sequential 80/20 split: the first 80% of rows train the models and the rest
# is held out. The cut-off is derived from nrow(input) instead of being
# hard-coded; for the 358014 rows reported above it is 286411, as before.
# NOTE(review): the split follows file order, not a random sample -- confirm
# the rows are not sorted in a way that biases it.
n_train <- floor(0.8 * nrow(input))
is_train <- seq_len(nrow(input)) <= n_train
train <- input[is_train, ]
test <- input[!is_train, ]
# Logistic regression of loan_status on the six numeric columns of `input`.
# The formula uses bare column names so that `data = train` takes effect. The
# previous `input$...` terms were read from `input` directly, so `data` was
# ignored and the model was fitted on every row, held-out rows included. The
# console output recorded below predates this fix.
lend_glm <- glm(
  loan_status ~ loan_amnt + annual_inc + int_rate + installment +
    revol_bal + revol_util,
  family = binomial,
  data = train
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Show the fitted logistic model: coefficient table, deviances and AIC.
summary(lend_glm)
## 
## Call:
## glm(formula = input$loan_status ~ input$loan_amnt + input$annual_inc + 
##     input$int_rate + input$installment + input$revol_bal + input$revol_util, 
##     family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.4904   0.3861   0.5473   0.6909   1.6076  
## 
## Coefficients:
##                     Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)        3.175e+00  1.900e-02  167.097   <2e-16 ***
## input$loan_amnt   -6.608e-05  1.634e-06  -40.436   <2e-16 ***
## input$annual_inc   5.664e-06  1.412e-07   40.111   <2e-16 ***
## input$int_rate    -1.255e-01  1.021e-03 -122.916   <2e-16 ***
## input$installment  1.713e-03  5.475e-05   31.283   <2e-16 ***
## input$revol_bal    5.856e-07  2.850e-07    2.054     0.04 *  
## input$revol_util  -3.144e-03  1.959e-04  -16.049   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 359616  on 358013  degrees of freedom
## Residual deviance: 334527  on 358007  degrees of freedom
## AIC: 334541
## 
## Number of Fisher Scoring iterations: 5
# Linear model of loan_amnt on loan_status and five numeric columns of
# `input`. The formula uses bare column names so that `data = train` takes
# effect; with the previous `input$...` terms `data` was ignored and the model
# was fitted on every row of `input`. The console output recorded below
# predates this fix.
lend_lm1 <- lm(
  loan_amnt ~ loan_status + annual_inc + int_rate + installment +
    revol_bal + revol_util,
  data = train
)
summary(lend_lm1)
## 
## Call:
## lm(formula = input$loan_amnt ~ input$loan_status + input$annual_inc + 
##     input$int_rate + input$installment + input$revol_bal + input$revol_util, 
##     data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -31159.8  -1252.1   -576.0    295.9  13631.8 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  5.370e+02  1.961e+01   27.39   <2e-16 ***
## input$loan_statusFully Paid -4.219e+02  1.053e+01  -40.06   <2e-16 ***
## input$annual_inc             2.917e-03  7.298e-05   39.97   <2e-16 ***
## input$int_rate               3.864e+01  9.879e-01   39.12   <2e-16 ***
## input$installment            3.121e+01  1.803e-02 1730.77   <2e-16 ***
## input$revol_bal              1.315e-02  2.154e-04   61.07   <2e-16 ***
## input$revol_util            -1.042e+01  1.823e-01  -57.18   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2445 on 358007 degrees of freedom
## Multiple R-squared:  0.9149, Adjusted R-squared:  0.9149 
## F-statistic: 6.419e+05 on 6 and 358007 DF,  p-value: < 2.2e-16
# Reduced linear model: loan_amnt on revol_bal, annual_inc and installment.
# The formula uses bare column names so that `data = train` takes effect; with
# the previous `input$...` terms `data` was ignored and the model was fitted on
# every row of `input`. The console output recorded below predates this fix.
lend_lm2 <- lm(
  loan_amnt ~ revol_bal + annual_inc + installment,
  data = train
)
summary(lend_lm2)
## 
## Call:
## lm(formula = input$loan_amnt ~ input$revol_bal + input$annual_inc + 
##     input$installment, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -26291.5  -1243.4   -668.1    143.1  14080.7 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.932e+02  8.770e+00   22.02   <2e-16 ***
## input$revol_bal   1.010e-02  2.116e-04   47.74   <2e-16 ***
## input$annual_inc  2.616e-03  7.302e-05   35.82   <2e-16 ***
## input$installment 3.132e+01  1.788e-02 1751.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2466 on 358010 degrees of freedom
## Multiple R-squared:  0.9135, Adjusted R-squared:  0.9135 
## F-statistic: 1.26e+06 on 3 and 358010 DF,  p-value: < 2.2e-16